In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import plotly.express as px
from collections import Counter
from utils_res import common_neurons, common_neurons_percentage, Init, common_neurons_percentage_multiple, \
                        common_neurons_multiple, common_diff_heatmap, common_heatmap
In [2]:
# Morphological categories (POS_Feature) shared by every table and plot in this notebook.
CATS = [
    'ADJ_Gender',
    'NOUN_Number',
    'NOUN_Case',
    'VERB_Aspect',
    'VERB_Person',
    'VERB_Tense',
]
In [3]:
import matplotlib.pyplot as plt
In [4]:
# Load the results stored under res/broken1kk/ into `broken1kk` (the "broken" model).
# Init comes from utils_res; NOTE(review): 'ru' and 'taiga' look like language and
# dataset identifiers — confirm against utils_res.
path = 'res/broken1kk/'
broken1kk = Init(path, 'ru', 'taiga')
In [5]:
# Load the results stored under res/good1kk/ into `good1kk` (the "good" model).
# Same Init arguments as for the broken model, so the two objects are comparable.
path = 'res/good1kk/'
good1kk = Init(path, 'ru', 'taiga')
In [6]:
# Per-layer scores of the good model, indexed by category and then by layer.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('res/good_layers/scores_layers_ru_taiga.pkl', 'rb') as scores_file:
    good_layers_scores = pickle.load(scores_file)
In [7]:
# Per-layer scores of the broken model, indexed by category and then by layer.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('res/broken_layers/scores_layers_ru_taiga.pkl', 'rb') as scores_file:
    broken_layers_scores = pickle.load(scores_file)
In [8]:
def plot_layerwise(dct1, dct2, cat):
    """Draw two side-by-side bar charts of per-layer accuracy.

    Parameters
    ----------
    dct1, dct2 : dict
        Mapping layer -> score entry; the plotted value is
        ``entry[1]['__OVERALL__']`` rounded to 3 decimals. ``dct1`` is
        titled "good model", ``dct2`` "broken model".
        Assumes the keys are numeric layer indices — TODO confirm.
    cat : str
        Category name, used only in the subplot titles.

    Each bar gets a value label placed at the bar's own x position, so the
    labels stay correct whatever the key order or the number of layers.
    """
    panels = [(f'{cat} good model', dct1), (f'{cat} broken model', dct2)]
    palette = plt.get_cmap('Paired').colors

    fig, axes = plt.subplots(1, len(panels), figsize=(10, 3), squeeze=False)
    for ax, (title, scores) in zip(axes[0], panels):
        layers = list(scores.keys())
        accuracies = [round(entry[1]['__OVERALL__'], 3) for entry in scores.values()]

        ax.set_title(title)
        ax.set_xlabel('Layers')
        ax.set_ylabel('Accuracy')
        ax.bar(layers, accuracies, color=palette, edgecolor='k')
        ax.set_xticks(layers)

        # Label every bar at its own x coordinate (not at its position in the dict).
        for layer, accuracy in zip(layers, accuracies):
            ax.text(layer, accuracy, str(accuracy), ha='center',
                    bbox=dict(facecolor='cyan', alpha=0.7), size='xx-small')

    fig.tight_layout()
    plt.show()
In [9]:
# Layer-wise accuracy for ADJ_Gender: good model (left panel) vs broken model (right panel).
plot_layerwise(good_layers_scores['ADJ_Gender'],broken_layers_scores['ADJ_Gender'], 'ADJ_Gender')
In [10]:
def accuracy_lines(dct_acc1, dct_acc2, cat):
    """Plot per-layer accuracy of the good and broken models as two lines.

    Both mappings (layer -> score entry) are sorted by key first; the plotted
    value is ``entry[1]['__OVERALL__']`` rounded to 2 decimals. The x axis
    uses the sorted keys of ``dct_acc1``. ``cat`` is accepted but not used.
    """
    def overall(scores):
        # One rounded overall accuracy per layer, in the mapping's order.
        return [round(entry[1]['__OVERALL__'], 2) for entry in scores.values()]

    good_by_layer = dict(sorted(dct_acc1.items()))
    broken_by_layer = dict(sorted(dct_acc2.items()))

    frame = pd.DataFrame({
        'Layers': list(good_by_layer),
        'good model': overall(good_by_layer),
        'broken model': overall(broken_by_layer),
    })

    fig = px.line(
        frame,
        x='Layers',
        y=['good model', 'broken model'],
        template="plotly_white",
        color_discrete_map={'good model': 'green', 'broken model': 'red'},
    )
    fig.update_xaxes(tickmode='linear')
    fig.update_yaxes(title='Accuracy')
    fig.update_layout(
        # Horizontal legend anchored to the top-right corner of the plot area.
        legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1,
                    traceorder="normal", font=dict(size=18)),
        font=dict(family="Times New Roman", size=18, color="Black"),
    )
    fig.show()
In [11]:
# Same ADJ_Gender layer-wise scores as above, drawn as two lines on one Plotly chart.
accuracy_lines(good_layers_scores['ADJ_Gender'],broken_layers_scores['ADJ_Gender'], 'ADJ_Gender')
In [12]:
def accuracy_dif(d1, d2):
    """Grouped bar chart of overall accuracy per category: good vs broken model.

    ``d1`` (good) and ``d2`` (broken) map each category in ``CATS`` to a score
    entry; the plotted value is ``entry[1]['__OVERALL__']`` rounded to 2 decimals.
    """
    def overall(scores):
        # Restrict to the notebook's categories, in CATS order.
        return [round(scores[name][1]['__OVERALL__'], 2) for name in CATS]

    frame = pd.DataFrame({
        'Categories': CATS,
        'good model': overall(d1),
        'broken model': overall(d2),
    })

    fig = px.bar(
        frame,
        x='Categories',
        y=['good model', 'broken model'],
        template="plotly_white",
        barmode='group',
        color_discrete_map={'good model': 'seagreen', 'broken model': 'coral'},
    )
    fig.update_traces(texttemplate='%{y}', textposition='outside')
    fig.update_yaxes(title='Accuracy')
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1,
                    traceorder="normal", font=dict(size=18)),
        font=dict(family="Times New Roman", size=18, color="Black"),
    )
    fig.show()
In [13]:
def accuracy_dif2(d1, d2, d3):
    """Grouped bar chart of overall accuracy per category for three neuron subsets.

    ``d1``, ``d2`` and ``d3`` map each category in ``CATS`` to a score entry and
    are shown as 'all neurons', 'top 20%' and 'bottom 20%' respectively; the
    plotted value is ``entry[1]['__OVERALL__']`` rounded to 2 decimals.
    """
    def overall(scores):
        # Restrict to the notebook's categories, in CATS order.
        return [round(scores[name][1]['__OVERALL__'], 2) for name in CATS]

    series_names = ['all neurons', 'top 20%', 'bottom 20%']
    columns = {'Categories': CATS}
    for label, scores in zip(series_names, (d1, d2, d3)):
        columns[label] = overall(scores)
    frame = pd.DataFrame(columns)

    fig = px.bar(frame, x='Categories', y=series_names,
                 template="plotly_white", barmode='group')
    fig.update_traces(texttemplate='%{y}', textposition='outside')
    fig.update_yaxes(title='Accuracy')
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1,
                    traceorder="normal", font=dict(size=18)),
        font=dict(family="Times New Roman", size=18, color="Black"),
    )
    fig.show()
In [14]:
def ls(path, first=1, stride=50, last=1000):
    """Read a loss log and return a sparse sample of it for plotting.

    Every sampled line is split on whitespace: field 1 is parsed as the float
    loss; field 2 is turned into a label by dropping its last three characters
    and appending 'k' (e.g. '50000' -> '50k').

    Only the sampled lines are parsed, so other lines (a header, for example)
    do not have to follow that format.

    NOTE: the notebook defines ``ls`` a second time in a later cell; that later
    definition is the one in effect when ``ls`` is called.

    Parameters
    ----------
    path : str
        Path of the text log.
    first : int
        Index of the first sampled line (default 1).
    stride, last : int
        Further samples are lines ``stride, 2*stride, ..., last``
        (defaults 50 and 1000, i.e. 21 samples in total).

    Returns
    -------
    (list[float], list[str])
        Sampled losses and their step labels, in file order.

    Raises
    ------
    IndexError
        If the file has fewer than ``last + 1`` lines or a sampled line has
        fewer than three fields.
    """
    def parse(line):
        fields = line.split()
        return float(fields[1]), fields[2][:-3] + 'k'

    with open(path) as file:
        lines = [line.rstrip() for line in file]

    loss, steps = [], []
    for index in [first, *range(stride, last + 1, stride)]:
        value, label = parse(lines[index])
        loss.append(value)
        steps.append(label)
    return loss, steps
In [15]:
# Partition the flat neuron index space 0..9983 into 13 consecutive blocks of
# 768 ids; bert_neurons_layers[k] lists the neuron ids belonging to block k.
# NOTE(review): presumably 13 = embedding output + 12 transformer layers of a
# BERT-base-sized model — confirm.
N_LAYERS = 13
NEURONS_PER_LAYER = 768
bert_neurons_layers = [
    list(range(start, start + NEURONS_PER_LAYER))
    for start in range(0, N_LAYERS * NEURONS_PER_LAYER, NEURONS_PER_LAYER)
]
In [16]:
from collections import Counter
def counter_layers(dct, layers=None):
    """Count how many of the given neuron ids fall into each layer.

    Parameters
    ----------
    dct : iterable
        Neuron ids to locate. Ids that belong to no layer are ignored.
    layers : list of lists, optional
        ``layers[k]`` holds the neuron ids of layer ``k``. Defaults to the
        notebook-level ``bert_neurons_layers``. Layers are assumed disjoint.

    Returns
    -------
    (dict, dict)
        Both keyed by layer index in ascending order: the share of located
        neurons per layer as a string such as ``'12.5%'`` (one decimal), and
        the raw count per layer. Layers without any hit are absent.
    """
    if layers is None:
        layers = bert_neurons_layers

    # One id -> layer lookup replaces a scan of every layer for every neuron.
    layer_of = {}
    for layer_index, neuron_ids in enumerate(layers):
        for neuron in neuron_ids:
            layer_of.setdefault(neuron, layer_index)

    hits = Counter()
    for neuron in dct:
        layer_index = layer_of.get(neuron)
        if layer_index is not None:
            hits[layer_index] += 1

    total = sum(hits.values())
    percentages = {
        layer_index: str(round(count / total * 100, 1)) + '%'
        for layer_index, count in hits.items()
    }
    return dict(sorted(percentages.items())), dict(sorted(hits.items()))
In [17]:
def compare(d1, d2, d3, d4, cat_name):
    """Stack four per-layer mappings into one table, one row per mapping.

    Rows are labelled ``<cat_name>_broken_percentage``, ``_broken_neurons``,
    ``_good_percentage`` and ``_good_neurons`` and filled from ``d1``..``d4``
    in that order. The columns are the keys of ``d1``.
    """
    suffixes = ('broken_percentage', 'broken_neurons', 'good_percentage', 'good_neurons')
    row_labels = [f'{cat_name}_{suffix}' for suffix in suffixes]

    table = pd.DataFrame(index=row_labels, columns=list(d1.keys()))
    table = table.fillna(0)
    for label, values in zip(row_labels, (d1, d2, d3, d4)):
        table.loc[label] = pd.Series(values)
    return table
In [18]:
def mine(d, idx):
    """Build a category x layer table of neuron counts.

    For every category in ``CATS`` the neuron ids in ``d[category]`` are
    assigned to layers with ``counter_layers`` and the per-layer counts become
    that category's row. The columns are the layer indices 0..12.
    ``idx`` is accepted but not used.
    """
    table = pd.DataFrame(index=CATS, columns=list(range(13)))
    table = table.fillna(0)
    for category in CATS:
        # Only the raw counts are needed here; the percentage dict is dropped.
        _percentages, counts = counter_layers(d[category.split()[0]])
        table.loc[category] = pd.Series(counts)
    return table
In [19]:
def vis(d1,d2, cat):
    """Summarise where the top neurons of one category sit, per model.

    ``d1`` holds the broken model's neuron ids and ``d2`` the good model's.
    Prints how many ids each model has and returns the ``compare`` table with
    the per-layer percentages and counts of both models.
    """
    broken_pct, broken_counts = counter_layers(d1)
    good_pct, good_counts = counter_layers(d2)
    summary = compare(broken_pct, broken_counts, good_pct, good_counts, cat)

    print('Number of top-20% neurons per this category')
    for label, neurons in (('For broken model:', d1), ('For good model:', d2)):
        print(label, len(neurons))
    return summary
In [20]:
def visualise(dct1, dct2, cat):
    """Draw two side-by-side bar charts of neuron counts per layer.

    Parameters
    ----------
    dct1, dct2 : dict
        Mapping layer index -> number of neurons; ``dct1`` is titled
        ``<cat>_broken`` and ``dct2`` ``<cat>_good``.
    cat : str
        Category name used in the subplot titles.

    The x axis carries the layer indices and the y axis the counts, labelled
    the same way as in ``large_vi``.
    """
    panels = ((f'{cat}_broken', dct1), (f'{cat}_good', dct2))
    palette = plt.get_cmap('Paired').colors

    plt.figure(figsize=(6, 3))
    for position, (title, counts) in enumerate(panels, start=1):
        layer_ids = list(counts.keys())
        plt.subplot(1, len(panels), position)
        plt.title(title)
        plt.xticks(layer_ids)
        plt.xlabel('Layers')
        plt.ylabel('Number of top neurons per layer')
        plt.bar(layer_ids, list(counts.values()),
                color=palette, edgecolor='k', linewidth=1)

    plt.tight_layout()
    plt.show()
In [21]:
# Stack the per-layer count tables of the good and the broken model.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
df_final = pd.concat([mine(good1kk.top_neurons, 'good'), mine(broken1kk.top_neurons, 'broken')])
/tmp/ipykernel_18271/792866911.py:1: FutureWarning:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

In [22]:
# Sort by category so each category's good/broken rows end up next to each other.
df_final = df_final.sort_index()
In [23]:
def large_vi(df1, df2, n, k):
    """Plot, per category, the layer-wise neuron counts of both models.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Category x layer tables for the good (``df1``) and broken (``df2``)
        model, as produced by ``mine``. Rows are paired by position.
    n : int or str
        Suffix of the saved image: the figure is written to ``foo<n>.png``.
    k : int or str
        Number of training steps, shown in the figure title.

    A 2 x 3 grid is used, so at most the first six categories are drawn;
    when there are fewer rows the remaining panels stay empty.
    """
    layers = [int(column) for column in df1.columns]

    fig, axes = plt.subplots(2, 3, figsize=(14, 9))
    fig.suptitle(f'Top 20% neurons per category: layer-wise distribution for probed BERT models on {k} steps', fontsize=16)

    # zip stops at the shortest of: available panels, df1 rows, df2 rows.
    for ax, good_row, broken_row in zip(axes.flat, df1.index, df2.index):
        ax.set_title(f'{good_row.split()[0]}')
        ax.set_ylabel('Number of neurons per layer')
        ax.set_xlabel('Layers')
        ax.plot(layers, df1.loc[good_row], label="good model", color="g")
        ax.plot(layers, df2.loc[broken_row], label="broken model", color="r")
        ax.set_xticks(list(range(len(layers))))
        ax.set_xticklabels(layers)
        ax.legend(loc='best')
        ax.grid(True)

    fig.tight_layout()
    fig.savefig(f'foo{n}.png')
    plt.show()

Визуализации¶

Здесь переходим от нейронов к слоям, в которых они находятся¶

In [24]:
# Per-layer shares and counts of the ADJ_Gender top neurons: broken model first, good model second.
vis(broken1kk.top_neurons['ADJ_Gender'], good1kk.top_neurons['ADJ_Gender'], 'ADJ_Gender')
Number of top-20% neurons per this category
For broken model: 689
For good model: 635
Out[24]:
0 1 2 3 4 5 6 7 8 9 10 11 12
ADJ_Gender_broken_percentage 7.0% 8.0% 7.0% 10.7% 7.1% 7.1% 7.0% 4.9% 5.8% 6.2% 10.3% 9.0% 9.9%
ADJ_Gender_broken_neurons 48 55 48 74 49 49 48 34 40 43 71 62 68
ADJ_Gender_good_percentage 2.7% 4.4% 6.1% 13.2% 7.9% 6.6% 7.4% 6.3% 6.8% 6.0% 12.8% 9.6% 10.2%
ADJ_Gender_good_neurons 17 28 39 84 50 42 47 40 43 38 81 61 65
In [25]:
# Map the ADJ_Gender top neurons of each model to layers, then plot the raw
# per-layer counts (the percentage dicts are not used here).
new_broken, new_broken_neurons = counter_layers(broken1kk.top_neurons['ADJ_Gender'])
new_good, new_good_neurons = counter_layers(good1kk.top_neurons['ADJ_Gender'])
visualise(new_broken_neurons, new_good_neurons, 'ADJ_Gender')

Большая совмещённая визуализация: распределение по слоям количества нейронов, попавших в топ-20% по весу (по общему ранжированию), для двух моделей¶

In [26]:
# Overlay good vs broken per-layer counts for every category; saves foo3.png,
# and '1000000' is shown as the step count in the figure title.
large_vi(mine(good1kk.top_neurons, 'good'),mine(broken1kk.top_neurons, 'broken'), 3, '1000000')

Сравнение с топ-нейронами на контрол-таск пробинге! Отличия в рамках одной модели на нескольких категориях¶

Примечание: в легенде «broken» обозначает control task.

In [27]:
# Load the results stored under res/g1kk/ into `g1kk`.
# NOTE(review): presumably the good model at 1kk steps — confirm what this directory holds.
path = 'res/g1kk/'
g1kk = Init(path, 'ru', 'taiga')
In [28]:
# Load the results stored under res/b1kk/ into `b1kk`.
# NOTE(review): the markdown above says "broken" stands for the control task in the
# legend — confirm whether this directory holds broken-model or control-task results.
path = 'res/b1kk/'
b1kk = Init(path, 'ru', 'taiga')
In [29]:
# Load the results stored under res/g700k/ into `g700k`.
# NOTE(review): presumably the good model at 700k steps — confirm.
path = 'res/g700k/'
g700k = Init(path, 'ru', 'taiga')
In [30]:
# Table of the top-N% neuron overlap (N = 5..30, per category) between g700k and g1kk.
# The next cell computes the same table again and draws it as a heatmap.
common_neurons_percentage_multiple(g700k.ordered_neurons, g1kk.ordered_neurons)
Out[30]:
ADJ_Gender NOUN_Number NOUN_Case VERB_Aspect VERB_Person VERB_Tense
30% 52.43 45.34 48.25 47.40 49.67 51.56
25% 49.41 46.49 45.29 47.08 47.79 49.47
20% 47.09 40.23 41.09 46.78 46.41 45.36
15% 43.79 34.60 38.44 46.35 45.35 43.68
10% 40.44 33.33 41.81 41.05 39.63 42.38
5% 31.17 27.78 39.43 45.45 33.09 35.04
In [31]:
# Heatmap of the top-N% neuron overlap between g700k and g1kk.
overlap_700k_vs_1kk = common_neurons_percentage_multiple(
    g700k.ordered_neurons, g1kk.ordered_neurons
)
overlap_fig = px.imshow(
    overlap_700k_vs_1kk,
    text_auto=True,
    labels=dict(x="Categories", y="Top N% of neurons"),
    template="seaborn",
    title="Percentage of top-N% neurons overlap (comparison between good BERTs after 700k and 1kk steps)",
)
overlap_fig.show()
In [32]:
# Heatmap of the top-N% neuron overlap between g1kk and b1kk.
overlap_good_vs_broken = common_neurons_percentage_multiple(
    g1kk.ordered_neurons, b1kk.ordered_neurons
)
overlap_fig = px.imshow(
    overlap_good_vs_broken,
    text_auto=True,
    labels=dict(x="Categories", y="Top N% of neurons"),
    title="Percentage of top-N% neurons overlap (comparison between good and broken BERTs after 1kk steps)",
)
overlap_fig.show()
In [33]:
def plot_distr(d, filename='foo5.png'):
    """Bar chart of how many top neurons each category has.

    Parameters
    ----------
    d : dict
        Mapping category -> collection of neuron ids; only the categories in
        ``CATS`` are plotted and the bar height is ``len(d[category])``.
    filename : str
        Where the figure is saved (default ``'foo5.png'``).
    """
    counts = {category: len(d[category]) for category in CATS}
    names = list(counts.keys())
    sizes = list(counts.values())

    plt.figure(figsize=(5, 5))
    plt.xlabel('Categories', fontsize=8)
    plt.ylabel('Number of top-20% of neurons', fontsize=8)
    plt.bar(names, sizes, color=plt.get_cmap('Paired').colors, edgecolor='k', width=0.5)

    # On a categorical axis bar i sits at x = i; label each bar with its height.
    for position, size in enumerate(sizes):
        plt.text(position, size, size, ha='center',
                 bbox=dict(facecolor='cyan', alpha=0.7), size='small')

    plt.xticks(rotation=30, ha="right", fontsize=7)
    plt.yticks(fontsize=7)
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()
In [34]:
# Number of top neurons per category for g1kk; the figure is also saved as foo5.png.
plot_distr(g1kk.top_neurons)
In [35]:
def plot_clas(d, q):
    """Bar chart of the number of labels per category.

    Parameters
    ----------
    d : dict
        Mapping category -> indexable; element ``[2]`` of each entry is the
        plotted value. NOTE(review): presumably the number of distinct labels
        of the category — confirm against utils_res.
    q : dict
        Not used by the plot; kept so existing calls keep working.
    """
    label_counts = pd.DataFrame({
        'Categories': list(CATS),
        'Number of labels per category': [d[category][2] for category in CATS],
    })

    fig = px.bar(label_counts, x='Categories', y='Number of labels per category',
                 template="ggplot2")
    fig.update_traces(texttemplate='%{y}', textposition='inside')
    fig.update_xaxes(tickmode='linear', tickangle=300)
    fig.update_yaxes(title='Number of labels per category')
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1,
                    traceorder="normal", font=dict(size=18)),
        font=dict(family="Times New Roman", size=18, color="Black"),
    )
    fig.show()
In [36]:
# Number of labels per category, read from g1kk.size.
plot_clas(g1kk.size, g1kk.top_neurons)

Сравнение метрик моделей good vs broken¶

Важно: во всех экспериментах пробинг проводится на одних и тех же данных и с одним и тем же сидом¶

In [37]:
accuracy_dif(good1kk.scores, broken1kk.scores) # overall accuracy per category: good vs broken model

Здесь контрол-таск¶

In [38]:
def accuracy_dif_control(d1, d2):
    """Grouped bar chart of overall accuracy per category: real task vs control task.

    ``d1`` (shown as 'actual accuracy') and ``d2`` (shown as 'control task') map
    each category in ``CATS`` to a score entry; the plotted value is
    ``entry[1]['__OVERALL__']`` rounded to 2 decimals.
    """
    def overall(scores):
        # Restrict to the notebook's categories, in CATS order.
        return [round(scores[name][1]['__OVERALL__'], 2) for name in CATS]

    frame = pd.DataFrame({
        'Categories': CATS,
        'actual accuracy': overall(d1),
        'control task': overall(d2),
    })

    fig = px.bar(
        frame,
        x='Categories',
        y=['actual accuracy', 'control task'],
        template="plotly_white",
        barmode='group',
        color_discrete_map={'actual accuracy': 'green', 'control task': 'red'},
    )
    fig.update_traces(texttemplate='%{y}', textposition='outside')
    fig.update_xaxes(tickmode='linear')
    fig.update_yaxes(title='Accuracy')
    fig.update_layout(
        legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1,
                    traceorder="normal", font=dict(size=18)),
        font=dict(family="Times New Roman", size=18, color="Black"),
    )
    fig.show()

BROKEN model (actual model test scores vs control task test scores!)¶

In [39]:
# Broken model: scores on the real task vs scores on the control task.
accuracy_dif_control(broken1kk.scores, broken1kk.scores_c) 

GOOD model: то же самое¶

In [40]:
accuracy_dif_control(good1kk.scores, good1kk.scores_c) # good model: real task vs control task

Subset'ы нейронов¶

In [41]:
accuracy_dif2(good1kk.scores, good1kk.scores_keep_top, good1kk.scores_keep_bot) # good model: all neurons vs top 20% vs bottom 20%

Лосс во время обучения берт-моделей¶

In [42]:
def ls(path, first=1, stride=50, last=1000):
    """Read a loss log and return a sparse sample of it for plotting.

    Every sampled line is split on whitespace: field 1 is parsed as the float
    loss; field 2 is turned into a label by dropping its last three characters
    and appending 'k' (e.g. '50000' -> '50k').

    Only the sampled lines are parsed, so other lines (a header, for example)
    do not have to follow that format.

    NOTE: an earlier cell of the notebook also defines ``ls``; this definition
    replaces it.

    Parameters
    ----------
    path : str
        Path of the text log.
    first : int
        Index of the first sampled line (default 1).
    stride, last : int
        Further samples are lines ``stride, 2*stride, ..., last``
        (defaults 50 and 1000, i.e. 21 samples in total).

    Returns
    -------
    (list[float], list[str])
        Sampled losses and their step labels, in file order.

    Raises
    ------
    IndexError
        If the file has fewer than ``last + 1`` lines or a sampled line has
        fewer than three fields.
    """
    def parse(line):
        fields = line.split()
        return float(fields[1]), fields[2][:-3] + 'k'

    with open(path) as file:
        lines = [line.rstrip() for line in file]

    loss, steps = [], []
    for index in [first, *range(stride, last + 1, stride)]:
        value, label = parse(lines[index])
        loss.append(value)
        steps.append(label)
    return loss, steps
In [43]:
# Sampled training loss of each model; both logs go through the same sampling in ls().
loss, steps = ls('good_loss.txt')
loss_broken, steps_broken = ls('loss_broken.txt')
In [44]:
# Keep every second sampled step as an x-axis tick:
# `loc` holds the tick positions and `labels` the matching step labels.
loc = list(range(0, len(steps), 2))
labels = [steps[position] for position in loc]
In [45]:
# Hide the label of the first tick.
labels[0] = ''
In [46]:
import plotly.io as io
# Inspection only: show the Plotly templates that are available and the current default.
io.templates
Out[46]:
Templates configuration
-----------------------
    Default template: 'plotly'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']
In [47]:
# Use the 'presentation' template for figures that do not pass an explicit template.
io.templates.default = 'presentation'
In [48]:
import plotly.graph_objects as go
import pandas as pd

# Initialise Plotly's offline notebook mode; this may be required for figures to
# render inline in the notebook.
import plotly.offline as pyo
pyo.init_notebook_mode()

# One row per sampled training step, with the loss of each model.
loss_df = pd.DataFrame(list(zip(steps, loss, loss_broken)),
                       columns=['steps', 'good model', 'broken model'])

# Route DataFrame.plot through Plotly (this is a global pandas option).
pd.options.plotting.backend = "plotly"
loss_fig = loss_df.plot(x='steps', y=['good model', 'broken model'],
                        color_discrete_sequence=["green", 'red'])
loss_fig.update_layout(xaxis_title="steps", yaxis_title="loss")
loss_fig.update_xaxes(tickangle=300)
loss_fig.update_layout(legend=dict(itemsizing='trace'))
# Horizontal legend placed just above the top-right corner of the plot area.
loss_fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
loss_fig.update_layout(
    font=dict(
        family="Times New Roman",
        size=20,
        color="Black"
    ))

loss_fig.update_traces(line=dict(width=1.7))
# Show only the tick positions in `loc`, labelled with `labels` (prepared in earlier cells).
# This call is the cell's last expression, so the figure it returns is displayed.
loss_fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=loc,
        ticktext=labels
    ))
In [49]:
from transformers import AutoTokenizer
# Tokenizer shared by both fill-mask pipelines below.
# NOTE(review): 'cointegrated/rubert-tiny2' may be fetched from the Hugging Face Hub on first use.
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
In [50]:
from transformers import pipeline

# Fill-mask pipeline for the good model.
# NOTE(review): "good/1kk" is presumably a local checkpoint directory — confirm.
fill_mask1 = pipeline(
    "fill-mask",
    model="good/1kk",
    tokenizer=tokenizer
)
2023-06-17 15:23:49.849819: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [51]:
from transformers import pipeline

# Fill-mask pipeline for the broken model, using the same tokenizer as fill_mask1.
# NOTE(review): "broken/1kk" is presumably a local checkpoint directory — confirm.
fill_mask2 = pipeline(
    "fill-mask",
    model="broken/1kk",
    tokenizer=tokenizer
)
In [52]:
fill_mask1('Девочка [MASK].') # good model; prompt: "The girl [MASK]."
Out[52]:
[{'score': 0.06278787553310394,
  'token': 42196,
  'token_str': 'красивая',
  'sequence': 'Девочка красивая.'},
 {'score': 0.05053425207734108,
  'token': 51151,
  'token_str': 'улыбается',
  'sequence': 'Девочка улыбается.'},
 {'score': 0.03701642155647278,
  'token': 62024,
  'token_str': 'плачет',
  'sequence': 'Девочка плачет.'},
 {'score': 0.031520627439022064,
  'token': 19788,
  'token_str': 'родилась',
  'sequence': 'Девочка родилась.'},
 {'score': 0.029919972643256187,
  'token': 60892,
  'token_str': 'беременна',
  'sequence': 'Девочка беременна.'}]
In [53]:
fill_mask2('Девочка [MASK].') # broken model; prompt: "The girl [MASK]."
Out[53]:
[{'score': 0.13866211473941803,
  'token': 10030,
  'token_str': 'нет',
  'sequence': 'Девочка нет.'},
 {'score': 0.03959677368402481,
  'token': 4674,
  'token_str': 'есть',
  'sequence': 'Девочка есть.'},
 {'score': 0.035999227315187454,
  'token': 58268,
  'token_str': 'красивое',
  'sequence': 'Девочка красивое.'},
 {'score': 0.018699567764997482,
  'token': 42196,
  'token_str': 'красивая',
  'sequence': 'Девочка красивая.'},
 {'score': 0.015550011768937111,
  'token': 41409,
  'token_str': 'красивый',
  'sequence': 'Девочка красивый.'}]
In [54]:
fill_mask1('Собака очень [MASK].') # good model; prompt: "The dog is very [MASK]."
Out[54]:
[{'score': 0.28373003005981445,
  'token': 42196,
  'token_str': 'красивая',
  'sequence': 'Собака очень красивая.'},
 {'score': 0.0524962916970253,
  'token': 62242,
  'token_str': 'добрая',
  'sequence': 'Собака очень добрая.'},
 {'score': 0.04815474897623062,
  'token': 72697,
  'token_str': 'умная',
  'sequence': 'Собака очень умная.'},
 {'score': 0.04628661274909973,
  'token': 35593,
  'token_str': 'хорошая',
  'sequence': 'Собака очень хорошая.'},
 {'score': 0.04445642605423927,
  'token': 42551,
  'token_str': 'сильная',
  'sequence': 'Собака очень сильная.'}]
In [55]:
fill_mask2('Собака очень [MASK].') # broken model; prompt: "The dog is very [MASK]."
Out[55]:
[{'score': 0.027899835258722305,
  'token': 33815,
  'token_str': 'сильный',
  'sequence': 'Собака очень сильный.'},
 {'score': 0.017986861988902092,
  'token': 58268,
  'token_str': 'красивое',
  'sequence': 'Собака очень красивое.'},
 {'score': 0.017785102128982544,
  'token': 42551,
  'token_str': 'сильная',
  'sequence': 'Собака очень сильная.'},
 {'score': 0.017573373392224312,
  'token': 35593,
  'token_str': 'хорошая',
  'sequence': 'Собака очень хорошая.'},
 {'score': 0.016169030219316483,
  'token': 49975,
  'token_str': 'умный',
  'sequence': 'Собака очень умный.'}]
In [56]:
fill_mask1('Она очень [MASK] ко мне.') # good model; prompt: "She is very [MASK] to me."
Out[56]:
[{'score': 0.3125692903995514,
  'token': 51625,
  'token_str': 'близка',
  'sequence': 'Она очень близка ко мне.'},
 {'score': 0.04508638381958008,
  'token': 11210,
  'token_str': 'относится',
  'sequence': 'Она очень относится ко мне.'},
 {'score': 0.026422062888741493,
  'token': 40953,
  'token_str': 'обращается',
  'sequence': 'Она очень обращается ко мне.'},
 {'score': 0.026114758104085922,
  'token': 34782,
  'token_str': 'близко',
  'sequence': 'Она очень близко ко мне.'},
 {'score': 0.02528938464820385,
  'token': 32120,
  'token_str': 'подходит',
  'sequence': 'Она очень подходит ко мне.'}]
In [57]:
fill_mask2('Она очень [MASK] ко мне.') # broken model; prompt: "She is very [MASK] to me."
Out[57]:
[{'score': 0.09649398177862167,
  'token': 11210,
  'token_str': 'относится',
  'sequence': 'Она очень относится ко мне.'},
 {'score': 0.07839849591255188,
  'token': 34782,
  'token_str': 'близко',
  'sequence': 'Она очень близко ко мне.'},
 {'score': 0.035141780972480774,
  'token': 44845,
  'token_str': 'близок',
  'sequence': 'Она очень близок ко мне.'},
 {'score': 0.026421379297971725,
  'token': 32509,
  'token_str': 'хотела',
  'sequence': 'Она очень хотела ко мне.'},
 {'score': 0.02373732440173626,
  'token': 51625,
  'token_str': 'близка',
  'sequence': 'Она очень близка ко мне.'}]
In [58]:
fill_mask1('Мальчик ходит в [MASK] ежедневно.') # good model; prompt: "The boy goes to [MASK] every day."
Out[58]:
[{'score': 0.7485777735710144,
  'token': 6897,
  'token_str': 'школу',
  'sequence': 'Мальчик ходит в школу ежедневно.'},
 {'score': 0.03214624151587486,
  'token': 9720,
  'token_str': 'церковь',
  'sequence': 'Мальчик ходит в церковь ежедневно.'},
 {'score': 0.023072995245456696,
  'token': 61279,
  'token_str': 'походы',
  'sequence': 'Мальчик ходит в походы ежедневно.'},
 {'score': 0.022506998851895332,
  'token': 55182,
  'token_str': 'спортзал',
  'sequence': 'Мальчик ходит в спортзал ежедневно.'},
 {'score': 0.01384640485048294,
  'token': 23048,
  'token_str': 'магазин',
  'sequence': 'Мальчик ходит в магазин ежедневно.'}]
In [59]:
fill_mask2('Мальчик ходит в [MASK] ежедневно.') # broken model; prompt: "The boy goes to [MASK] every day."
Out[59]:
[{'score': 0.6992385387420654,
  'token': 6897,
  'token_str': 'школу',
  'sequence': 'Мальчик ходит в школу ежедневно.'},
 {'score': 0.03742866590619087,
  'token': 77367,
  'token_str': 'садик',
  'sequence': 'Мальчик ходит в садик ежедневно.'},
 {'score': 0.03419847786426544,
  'token': 55182,
  'token_str': 'спортзал',
  'sequence': 'Мальчик ходит в спортзал ежедневно.'},
 {'score': 0.020481931045651436,
  'token': 9720,
  'token_str': 'церковь',
  'sequence': 'Мальчик ходит в церковь ежедневно.'},
 {'score': 0.016145747154951096,
  'token': 47148,
  'token_str': 'туалет',
  'sequence': 'Мальчик ходит в туалет ежедневно.'}]